Final Project - Analyzing Sales Data
Date: 12 September 2024
Author: Chonlaphon Chantararat (Phon lifetofree)
Course: Pandas Foundation
# import data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Load the raw store sales data into a DataFrame
df = pd.read_csv("sample-store.csv")
# quick look at the first five rows
df.head()
# dimensions as (rows, columns)
df.shape
(9994, 21)
# summary of each column: name, non-null count, and dtype via .info()
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row ID 9994 non-null int64
1 Order ID 9994 non-null object
2 Order Date 9994 non-null object
3 Ship Date 9994 non-null object
4 Ship Mode 9994 non-null object
5 Customer ID 9994 non-null object
6 Customer Name 9994 non-null object
7 Segment 9994 non-null object
8 Country/Region 9994 non-null object
9 City 9994 non-null object
10 State 9994 non-null object
11 Postal Code 9983 non-null float64
12 Region 9994 non-null object
13 Product ID 9994 non-null object
14 Category 9994 non-null object
15 Sub-Category 9994 non-null object
16 Product Name 9994 non-null object
17 Sales 9994 non-null float64
18 Quantity 9994 non-null int64
19 Discount 9994 non-null float64
20 Profit 9994 non-null float64
dtypes: float64(4), int64(2), object(15)
memory usage: 1.6+ MB
RangeIndex: 9994 entries, 0 to 9993
Data columns (total 21 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Row ID 9994 non-null int64
1 Order ID 9994 non-null object
2 Order Date 9994 non-null object
3 Ship Date 9994 non-null object
4 Ship Mode 9994 non-null object
5 Customer ID 9994 non-null object
6 Customer Name 9994 non-null object
7 Segment 9994 non-null object
8 Country/Region 9994 non-null object
9 City 9994 non-null object
10 State 9994 non-null object
11 Postal Code 9983 non-null float64
12 Region 9994 non-null object
13 Product ID 9994 non-null object
14 Category 9994 non-null object
15 Sub-Category 9994 non-null object
16 Product Name 9994 non-null object
17 Sales 9994 non-null float64
18 Quantity 9994 non-null int64
19 Discount 9994 non-null float64
20 Profit 9994 non-null float64
dtypes: float64(4), int64(2), object(15)
memory usage: 1.6+ MB
We can use the pd.to_datetime() function to convert the 'Order Date' and 'Ship Date' columns to datetime.
# demo: parse the first few 'Order Date' strings with pd.to_datetime()
pd.to_datetime(df['Order Date'].head(), format='%m/%d/%Y')
# convert both date columns to datetime64 in the original dataframe
for _col in ('Order Date', 'Ship Date'):
    df[_col] = pd.to_datetime(df[_col], format='%m/%d/%Y')
df.head(10)
# count missing values in the 'Postal Code' column
count_na = df['Postal Code'].isnull().sum()
count_na
11
# keep only fully-populated rows (drops the 11 rows missing Postal Code)
df_filter = df.dropna(axis=0)
df_filter
# Own exploration — question: how many orders per (Order Date, Segment)?
df_order_date = (
    df_filter
    .groupby(['Order Date', 'Segment'])
    .size()
    .reset_index(name='Count Order')
)
df_order_date
Data Analysis Part
Answer the 10 questions below to get credit for this course. Write pandas code to find the answers.
# TODO 01 - number of rows and columns in this dataset
df.shape
# ans: 9994 rows, 21 columns
(9994, 21)
# TODO 02 - any missing values? if so, in which column and how many?
df.isnull().sum()
# ans: 'Postal Code' has 11 missing values
# TODO 03 - your friend asked for the `California` data: filter and export CSV
# drop=True discards the stale index instead of writing it out as a column
result = df.dropna().query('State == "California"').reset_index(drop=True)
# index=False keeps the meaningless RangeIndex out of the exported file
# (the original wrote both the old index column and the new index)
result.to_csv('result.csv', index=False)
# TODO 04 - all `California` and `Texas` orders placed in 2017 (by Order Date),
# exported as a CSV file
# membership test via `in` replaces the chained `|` comparisons
df_state = df.dropna().query('State in ["California", "Texas"]')
# boolean mask over df_state: rows whose order year is 2017
order_year = df_state['Order Date'].dt.year == 2017
# index=False keeps the stray RangeIndex out of the exported file
df_state[order_year].to_csv('result_orders_2017.csv', index=False)
# TODO 05 - total, average, and standard deviation of Sales in 2017
# NOTE: no .dropna() on the Series here — dropping values would shorten the
# mask and misalign it with df if any Order Date were NaT (the original's
# `df['Order Date'].dropna()` was a latent alignment bug)
in_2017 = df['Order Date'].dt.year == 2017
df.loc[in_2017, 'Sales'].agg(['sum', 'mean', 'std'])
# TODO 06 - which Segment had the highest profit in 2018?
# mask built from the full column so it stays aligned with df
# (the original's `.dropna()` on the Series was a latent alignment bug)
in_2018 = df['Order Date'].dt.year == 2018
# total profit per segment over 2018 orders
df_profit = df.loc[in_2018].groupby('Segment')['Profit'].sum()
# single-row Series: the segment with the maximum profit
df_profit.nlargest(1)
# TODO 07 - top 5 states with the LEAST total sales, 15 Apr 2019 - 31 Dec 2019
date_start = pd.to_datetime('2019-04-15')
date_end = pd.to_datetime('2019-12-31')
# Series.between is inclusive on both ends. Build the mask from the full
# column and apply it to df itself: the original applied a full-length mask
# to df.dropna() (mismatched frames) and silently excluded sales rows that
# merely lacked a Postal Code, skewing the state totals.
in_range = df['Order Date'].between(date_start, date_end)
df_order_range = df.loc[in_range]
# total sales per state, then the 5 smallest
df_bottom_5 = df_order_range.groupby('State')['Sales'].sum()
df_bottom_5.nsmallest(5)
# TODO 08 - proportion of total 2019 sales (%) made in West + Central, e.g. 25%
# mask from the full column, no .dropna(): keeps it aligned with df
in_2019 = df['Order Date'].dt.year == 2019
df_2019 = df.loc[in_2019]
# denominator: all sales in 2019
total_sales_2019 = df_2019['Sales'].sum()
# numerator: sales from the two regions of interest
df_region = df_2019[df_2019['Region'].isin(['West', 'Central'])]
total_sales_region = df_region['Sales'].sum()
# percentage of the year's sales contributed by West + Central
sales_proportion = (total_sales_region / total_sales_2019) * 100
sales_proportion
54.97479891837763
# TODO 09 - top 10 popular products by number of orders (vs. total sales)
# during 2019-2020
# aligned boolean mask over the full column (no .dropna())
in_years = df['Order Date'].dt.year.isin([2019, 2020])
df_years = df.loc[in_years]
# per product: total sales and number of order lines
df_products = (
    df_years.groupby('Product Name')
    .agg({'Sales': 'sum', 'Order ID': 'count'})
    .reset_index()
)
df_products.columns = ['Product Name', 'Total Sales', 'Number of Orders']
# rank by order count, breaking ties on total sales; head(10) takes the top
# (the original sorted and then called nlargest, re-sorting redundantly)
df_products = df_products.sort_values(
    ['Number of Orders', 'Total Sales'], ascending=False
)
df_products.head(10)
# TODO 10 - plot at least 2 plots, any plot you think interesting :)
df_clean = df.dropna().copy()
# make sure 'Order Date' is datetime before extracting year/month parts
if df_clean['Order Date'].dtype != np.dtype('datetime64[ns]'):
    df_clean['Order Date'] = pd.to_datetime(df_clean['Order Date'], format='%m/%d/%Y')
## ---------- 1st plot: monthly sales, one line per year ---------- ##
df_clean['Year'] = df_clean['Order Date'].dt.year
df_clean['Month'] = df_clean['Order Date'].dt.month
# monthly sales totals per year (the original also built 'YearMonth'
# columns on both frames that were never used — dead code, removed)
df_sale_month = df_clean.groupby(['Year', 'Month'])['Sales'].sum().reset_index()
plt.figure(figsize=(10, 6))
# sorted() gives a chronologically ordered legend
for year in sorted(df_clean['Year'].unique()):
    year_data = df_sale_month[df_sale_month['Year'] == year]
    plt.plot(year_data['Month'], year_data['Sales'], label=year)
plt.xlabel('Month')
plt.ylabel('Sales')
plt.title('Year Sales by Month')
plt.legend()
plt.show()
## ---------- 2nd plot: yearly sales per category, grouped bars ---------- ##
# total sales per (Year, Category)
df_sale_category = df_clean.groupby(['Year', 'Category'])['Sales'].sum().reset_index()
plt.figure(figsize=(10, 6))
width = 0.25  # width of each bar
categories = df_sale_category['Category'].unique()
n_cats = len(categories)
for i, category in enumerate(categories):
    category_data = df_sale_category[df_sale_category['Category'] == category]
    # center the bar group on the year tick: offsets span ±(n-1)/2 * width.
    # (the original `x - width/2 + i*width` left the 3-bar group shifted
    # right of the tick, spanning x-0.125 .. x+0.375)
    offsets = [x + (i - (n_cats - 1) / 2) * width for x in category_data['Year']]
    plt.bar(offsets, category_data['Sales'], width, label=category)
plt.xlabel('Year')
plt.ylabel('Sales')
plt.title('Year Sales by Category')
plt.xticks(df_sale_category['Year'].unique())
plt.legend()
plt.show()
# TODO Bonus - use np.where() to create new column in dataframe to help you answer your own questions
df_clean = df.dropna().copy()
# make sure 'Order Date' is datetime before extracting the year
if df_clean['Order Date'].dtype != np.dtype('datetime64[ns]'):
    df_clean['Order Date'] = pd.to_datetime(df_clean['Order Date'], format='%m/%d/%Y')
# positional indices (within df_clean!) of 2017 orders in the Technology category
result = np.where((df_clean['Order Date'].dt.year == 2017) & (df_clean['Category'] == 'Technology'))[0]
# BUG FIX: np.where() returns positions relative to df_clean, so they must
# index df_clean — the original used df.iloc[result], but df is 11 rows
# longer than df_clean, so the positions selected the wrong rows
selected_rows = df_clean.iloc[result]
selected_rows